import re
import regex
import pandas as pd
import numpy as np
import emoji
import plotly.express as px
from collections import Counter
import matplotlib.pyplot as plt
from os import path
from PIL import Image
import datetime
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
def startsWithDateAndTimeAndroid(s):
    """Return True if *s* begins with an Android-export timestamp.

    Android lines look like '12/03/2020, 10:30 PM - Author: message';
    the AM/PM token is absent for 24-hour locales.
    """
    # Raw string: '\/' and friends otherwise trigger invalid-escape
    # warnings on modern Python.  Anchored at the start of the line.
    pattern = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
    return re.match(pattern, s) is not None
def startsWithDateAndTimeios(s):
    """Return True if *s* begins with an iOS-export timestamp.

    iOS lines look like '[26/03/20, 9:16:05 AM] Author: message'
    (separators may be '/' or '-'; seconds and AM/PM are optional).
    """
    # Raw string avoids invalid-escape warnings for '\[', '\/', '\]'.
    pattern = r'^\[([0-9]+)([\/-])([0-9]+)([\/-])([0-9]+)[,]? ([0-9]+):([0-9][0-9]):([0-9][0-9])?[ ]?(AM|PM|am|pm)?\]'
    return re.match(pattern, s) is not None
def FindAuthor(s):
    """Return True if *s* looks like 'Author: message'.

    WhatsApp separates the sender name from the body with ': '
    (colon + space).  Testing for that token — instead of the old
    "exactly one ':'" split — keeps the author when the body itself
    contains colons (e.g. 'John: meet at 5:30'), which the old check
    misclassified as a system message (author=None).
    """
    return ": " in s
def getDataPointAndroid(line):
    """Parse one timestamped Android line into (date, time, author, message).

    *line* looks like '12/03/2020, 10:30 PM - Author: message text'.
    *author* is None for system notices ('X added Y', subject changes, ...).
    The message keeps the leading space left after the author's colon —
    downstream filters compare against e.g. ' <Media omitted>'.
    """
    # Partition on the FIRST ' - ' only: the body may contain ' - ' too,
    # and the old unlimited split + ' '.join silently collapsed it.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    if FindAuthor(message):
        # Split on the first ':' only so times like '5:30' in the body
        # are not re-joined with spaces ('5 30') as before.
        author, _, message = message.partition(':')
    else:
        author = None
    return date, time, author, message
def getDataPointios(line):
    """Parse one timestamped iOS line into (date, time, author, message).

    *line* looks like '[26/03/20, 9:16:05 AM] Author: message'.  The
    trailing slicing trims the seconds from *time* so it resembles the
    Android 'H:MM AM' shape.  *author* is None for system notices.
    """
    # Split on the FIRST '] ' only: the body may contain '] ' itself,
    # and the old unlimited split + ' '.join silently rewrote it.
    splitLine = line.split('] ', 1)
    dateTime = splitLine[0]
    if ',' in dateTime:
        date, time = dateTime.split(',')
    else:
        date, time = dateTime.split(' ')
    message = splitLine[1] if len(splitLine) > 1 else ''
    if FindAuthor(message):
        # First ':' separates author from body; keep the rest intact.
        author, _, message = message.partition(':')
    else:
        author = None
    # Drop the seconds from the time string.  The slicing is
    # shape-dependent (1- vs 2-digit hour, with/without AM/PM) and is
    # preserved exactly from the original implementation.
    if time[5] == ":":
        time = time[:5] + time[-3:]
    else:
        if 'AM' in time or 'PM' in time:
            time = time[:6] + time[-3:]
        else:
            time = time[:6]
    return date, time, author, message
def dateconv(date):
    """Normalise an iOS-export date like '[26/03/20' to 'YYYY-MM-DD'.

    Handles both '-' and '/' separators and both 2- and 4-digit years
    (day-first order, as produced by the iOS export).  Returns None when
    the string matches neither shape — same fall-through as before, but
    now explicit.
    """
    # '-' takes priority over '/', matching the original branch order.
    for sep in ('-', '/'):
        if sep in date:
            year = date.split(sep)[2]
            if len(year) == 4:
                fmt = f"[%d{sep}%m{sep}%Y"
            elif len(year) == 2:
                fmt = f"[%d{sep}%m{sep}%y"
            else:
                return None
            return datetime.datetime.strptime(date, fmt).strftime("%Y-%m-%d")
    return None
def split_count(text):
    """Return every emoji grapheme cluster found in *text*, in order.

    regex's r'\X' matches full grapheme clusters, so multi-codepoint
    emoji (skin tones, flags, ZWJ families) are kept as single items.
    """
    # emoji >= 2.0 removed UNICODE_EMOJI in favour of EMOJI_DATA; fall
    # back so the notebook runs against either version of the package.
    emoji_chars = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_chars is None:
        emoji_chars = emoji.UNICODE_EMOJI
    return [cluster for cluster in regex.findall(r'\X', text)
            if any(char in emoji_chars for char in cluster)]
# ---- Raw export parsing --------------------------------------------------
# Stream the exported chat once.  The first line reveals the platform:
# iOS exports wrap the timestamp in '[...]', Android does not.  A line
# starting with a timestamp opens a new message; any other line is a
# continuation and is buffered until the next timestamp flushes the
# buffer into parsedData as [date, time, author, message].
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
conversationPath = "WhatsApp Chat with Data Science.txt"
with open(conversationPath, encoding="utf-8") as fp:
    device=''
    first=fp.readline()
    print(first)
    if '[' in first:
        device='ios'
    else:
        device="android"
    # NOTE(review): the second line is discarded unconditionally
    # (presumably the end-to-end-encryption notice) — a real message on
    # line 2 would be silently lost.  Confirm against sample exports.
    fp.readline()
    messageBuffer = []  # accumulates the (possibly multi-line) current message
    date, time, author = None, None, None
    while True:
        line = fp.readline()
        if not line:
            break  # EOF
        if device=="ios":
            line = line.strip()
            if startsWithDateAndTimeios(line):
                # New message: flush whatever was buffered for the previous one.
                if len(messageBuffer) > 0:
                    parsedData.append([date, time, author, ' '.join(messageBuffer)])
                messageBuffer.clear()
                date, time, author, message = getDataPointios(line)
                messageBuffer.append(message)
            else:
                # Retry after dropping non-ASCII: iOS inserts invisible
                # marks (e.g. U+200E) that break the timestamp match.
                line= (line.encode('ascii', 'ignore')).decode("utf-8")
                if startsWithDateAndTimeios(line):
                    if len(messageBuffer) > 0:
                        parsedData.append([date, time, author, ' '.join(messageBuffer)])
                    messageBuffer.clear()
                    date, time, author, message = getDataPointios(line)
                    messageBuffer.append(message)
                else:
                    messageBuffer.append(line)
        else:
            line = line.strip()
            if startsWithDateAndTimeAndroid(line):
                if len(messageBuffer) > 0:
                    parsedData.append([date, time, author, ' '.join(messageBuffer)])
                messageBuffer.clear()
                date, time, author, message = getDataPointAndroid(line)
                messageBuffer.append(message)
            else:
                messageBuffer.append(line)
# NOTE(review): the final buffered message is never flushed into
# parsedData when the file ends — the last chat message is dropped.
# ---- DataFrame construction ----------------------------------------------
if device =='android':
    # Android dates ('12/03/2020') can be handed to to_datetime directly.
    df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
    # NOTE(review): to_datetime defaults to month-first parsing; exports
    # from day-first locales may need dayfirst=True — confirm with data.
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.dropna()
    # One list of emoji grapheme clusters per message.
    df["emoji"] = df["Message"].apply(split_count)
    URLPATTERN = r'(https?://\S+)'
    # Number of URLs in each message.
    df['urlcount'] = df.Message.apply(lambda x: re.findall(URLPATTERN, x)).str.len()
else:
    # iOS dates carry a leading '[' and mixed separators; dateconv
    # normalises them to 'YYYY-MM-DD' before to_datetime.
    df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe.
    df = df.dropna()
    df["Date"] = df["Date"].apply(dateconv)
    df["Date"] = pd.to_datetime(df["Date"],format='%Y-%m-%d')
    df["emoji"] = df["Message"].apply(split_count)
    URLPATTERN = r'(https?://\S+)'
    df['urlcount'] = df.Message.apply(lambda x: re.findall(URLPATTERN, x)).str.len()
# ---- Quick inspection and derived calendar/feature columns ---------------
df.head()
df.tail()
df.Author.unique()
df = df.dropna()  # removes system notices (rows whose Author is None)
df.info()
# Get Year from Date
df['Date'] = pd.to_datetime(df['Date'])
df['Year']=df['Date'].dt.year
# Get Month from Date
df['Month']=df['Date'].dt.month
# Get Day from Date
df['Day']=df['Date'].dt.day
# Get hour from Time
# NOTE(review): Time keeps a leading space for 1-digit hours, so the
# first two characters are e.g. ' 9' vs '10' — verify grouping later.
df['Hour'] = df['Time'].str[:2]
# Get Media shared in the Message
df['Media']=df['Message'].str.contains('<Media omitted>')
df.head()
# ---- Group-level statistics ----------------------------------------------
total_messages = df.shape[0]
print(total_messages)
# The media placeholders keep the leading space left after the author split.
media_messages = df[df['Message'] == ' <Media omitted>'].shape[0]
print(media_messages)
# Recomputed to also cover the iOS placeholder texts.
media_messages =df[(df['Message'] == ' <Media omitted>')|(df['Message'] == ' image omitted')|(df['Message'] == ' video omitted')|(df['Message'] == ' sticker omitted')].shape[0]
print(media_messages)
# Total number of emoji across all messages.
emojis = sum(df['emoji'].str.len())
print(emojis)
URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df.Message.apply(lambda x: re.findall(URLPATTERN, x)).str.len()
links = np.sum(df.urlcount)
links
print("Group Wise Stats")
print("Messages:",total_messages)
print("Media:",media_messages)
print("Emojis:",emojis)
print("Links:",links)
# Partition the corpus so messages_df keeps only "real" text messages
# (no links, no media placeholders, no deletion notices).
link_messages= df[df['urlcount']>0]
deleted_messages=df[(df["Message"] == " You deleted this message")| (df["Message"] == " This message was deleted.")|(df["Message"] == " You deleted this message.")]
media_messages_df = df[(df['Message'] == ' <Media omitted>')|(df['Message'] == ' image omitted')|(df['Message'] == ' video omitted')|(df['Message'] == ' sticker omitted')]
messages_df = df.drop(media_messages_df.index)
messages_df = messages_df.drop(deleted_messages.index)
messages_df = messages_df.drop(link_messages.index)
# Per-message length features.
messages_df['Letter_Count'] = messages_df['Message'].apply(lambda s : len(s))
messages_df['Word_Count'] = messages_df['Message'].apply(lambda s : len(s.split(' ')))
messages_df["MessageCount"]=1
messages_df["emojicount"]= df['emoji'].str.len()
messages_df.head()
# ---- Per-author statistics -----------------------------------------------
l = messages_df.Author.unique()
for i in range(len(l)):
    # Filtering out messages of particular user
    req_df= messages_df[messages_df["Author"] == l[i]]
    # req_df will contain messages of only one particular user
    print(f'Stats of {l[i]} -')
    # shape will print number of rows which indirectly means the number of messages
    print('Messages Sent', req_df.shape[0])
    # Word_Count holds the word total per message; sum of all words / total messages = words per message
    words_per_message = (np.sum(req_df['Word_Count']))/req_df.shape[0]
    print('Words per message', words_per_message)
    # media counts this author's media messages (from the pre-split media frame)
    media = media_messages_df[media_messages_df['Author'] == l[i]].shape[0]
    print('Media Messages Sent', media)
    # emojis counts this author's total emoji
    emojis = sum(req_df['emoji'].str.len())
    print('Emojis Sent', emojis)
    # links counts this author's total links (from the pre-split link frame)
    links = sum(link_messages[link_messages['Author'] == l[i]]["urlcount"])
    print('Links Sent', links)
    print()
# ---- Emoji usage across the whole group ----------------------------------
# Distinct emoji used in the chat.
total_emojis_list = list(set([a for b in messages_df.emoji for a in b]))
total_emojis = len(total_emojis_list)
print(total_emojis)
# All emoji occurrences (with repeats), then frequency-sorted pairs.
total_emojis_list = list([a for b in messages_df.emoji for a in b])
emoji_dict = dict(Counter(total_emojis_list))
emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)
print(emoji_dict)
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df.head(10)
import plotly.express as px  # NOTE(review): redundant — already imported at the top of the file
fig = px.pie(emoji_df, values='count', names='emoji', title= "Emoji's Used")
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# ---- Per-author emoji distribution (one pie chart per author) ------------
l = messages_df.Author.unique()
for i in range(len(l)):
    dummy_df = messages_df[messages_df['Author'] == l[i]]
    # Flatten this author's per-message emoji lists into one list.
    total_emojis_list = list([a for b in dummy_df.emoji for a in b])
    emoji_dict = dict(Counter(total_emojis_list))
    # (emoji, count) pairs, most used first.
    emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)
    print('Emoji Distribution for', l[i])
    author_emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
    fig = px.pie(author_emoji_df, values='count', names='emoji')
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()
def f(i):
    """Map a weekday index (0=Monday ... 6=Sunday) to its English name."""
    weekday_names = ("Monday", "Tuesday", "Wednesday",
                     "Thursday", "Friday", "Saturday", "Sunday")
    return weekday_names[i]
# ---- Messages per weekday (radar chart) ----------------------------------
day_df=pd.DataFrame(messages_df["Message"])
day_df['day_of_date'] = messages_df['Date'].dt.weekday  # 0=Monday ... 6=Sunday
day_df['day_of_date'] = day_df["day_of_date"].apply(f)  # index -> weekday name
day_df["messagecount"] = 1  # unit count, summed by the groupby below
day = day_df.groupby("day_of_date").sum()
day.reset_index(inplace=True)
fig = px.line_polar(day, r='messagecount', theta='day_of_date', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
        )),
    showlegend=False
)
fig.show()
# ---- Message volume over time --------------------------------------------
# Daily totals of the per-message unit counter.
date_df = messages_df.groupby("Date").sum()
date_df.reset_index(inplace=True)
fig = px.line(date_df, x="Date", y="MessageCount")
fig.update_xaxes(nticks=20)
# Edit the layout
#fig.update_layout(title='Name',
#                  xaxis_title='Name',
#                  yaxis_title='Name')
fig.show()
#fig.write_image("messages.png")
# 30-day rolling mean smooths the daily series (first 29 points are NaN).
date_df["rolling"] = date_df["MessageCount"].rolling(30).mean()
fig = px.line(date_df, x="Date", y="rolling")
fig.update_xaxes(nticks=20)
fig.show()
# ---- Messages per author (horizontal bar chart) --------------------------
auth = messages_df.groupby("Author").sum()
auth.reset_index(inplace=True)
fig = px.bar(auth, y="Author", x="MessageCount", color='Author', orientation="h",
             color_discrete_sequence=["black", "brown", "rosybrown", "darkorange", "darkgoldenrod"],
             title="Number of Messages Sent Per User"
             )
fig.show()
# ---- Word cloud for the whole group --------------------------------------
text = " ".join(review for review in messages_df.Message)
# NOTE(review): len(text) counts characters, not words, despite the wording.
print ("There are {} words in all the messages.".format(len(text)))
text = " ".join(review for review in messages_df.Message)
# Chat-specific filler words (romanised shorthand) on top of the defaults.
stopwords = set(STOPWORDS)
stopwords.update(["ra", "ga", "na", "ani", "em", "ki", "ah","ha","la","eh","ne","le","ni","lo","Ma","Haa","ni", "u",
                  "us", "guy", "guys", "will", "now", "know"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, colormap="Oranges").generate(text)
# Display the generated image:
# the matplotlib way:
plt.figure( figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
#plt.savefig("cloud.jpg")
# ---- Word cloud per author -----------------------------------------------
l = messages_df.Author.unique()
for i in range(len(l)):
    # NOTE(review): the broad try/except silently hides real failures; it
    # presumably only needs to skip authors whose filtered text is empty
    # (WordCloud.generate raises ValueError then) — confirm and narrow.
    try:
        dummy_df = messages_df[messages_df['Author'] == l[i]]
        text = " ".join(review for review in dummy_df.Message)
        stopwords = set(STOPWORDS)
        stopwords.update(["ra", "ga", "na", "ani", "em", "ki", "ah","ha","la","eh","ne","le","ni","lo","Ma","Haa","ni", "u",
                          "us", "guy", "guys", "will", "now", "know"])
        # Generate a word cloud image
        print('Author name',l[i])
        wordcloud = WordCloud(stopwords=stopwords, colormap="Oranges").generate(text)
        # Display the generated image:
        # the matplotlib way:
        plt.figure( figsize=(10,5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
    except Exception:
        pass
# ---- Most active times ---------------------------------------------------
# Top 15 exact timestamps (minute granularity) by message count.
timeChat = messages_df['Time'].value_counts().nlargest(15).rename_axis("Time").reset_index(name='Count')
fig = px.bar(timeChat, y="Time", x="Count", color='Time', orientation="h",
             color_discrete_sequence=["black", "brown", "rosybrown", "darkorange", "darkgoldenrod"],
             title="Most Active Time of the Day"
             )
fig.show()
# Top 15 hours by message count ('Hour' is the first two chars of Time).
activeHour = messages_df.groupby(['Hour'])['Hour'].count().nlargest(15).rename_axis("Hour").reset_index(name='Count')
fig = px.bar(activeHour, y="Hour", x="Count", color='Hour', orientation="h",
             color_discrete_sequence=["black", "brown", "rosybrown", "darkorange", "darkgoldenrod"],
             title="Most Active Hour of the Day"
             )
fig.show()